pacman::p_load(tidyverse, data.table, broom,latex2exp,ggpubr,kableExtra,R.utils)
rm(list=ls())
options("modelsummary_format_numeric_latex" = "plain")
################################################################################


################## TRASE

df_b_br = fread("../../data/agribusiness/clean/beef_brazil.csv.gz")
df_s_br = fread("../../data/agribusiness/clean/soybean_brazil.csv.gz")
df_m_br = fread("../../data/agribusiness/clean/maize_brazil.csv.gz")
df_s_ar = fread("../../data/agribusiness/clean/soybean_argentina.csv.gz")

df_b_pt = df_b_br[,c('product_type','fob_usd')][, lapply(.SD, sum), by=.(product_type)]
df_s_pt = df_s_br[,c('product_type','fob_usd')][, lapply(.SD, sum), by=.(product_type)]
df_m_pt = df_m_br[,c('product_type','fob_usd')][, lapply(.SD, sum), by=.(product_type)]
df_s_a_pt = df_s_ar[,c('product_type','fob_usd')][, lapply(.SD, sum), by=.(product_type)]


################## FAOSTAT

df = fread("../../data/trade/raw/exports_brazil_argentina.csv")[,c('Year','Area','Item','Element','Value')]
setnames(df, new=c('year','country','product_type','element','value'))
df$country=toupper(df$country)

df1 = df[element=='Export Quantity',][,-c('element')]
df2 = df[element=='Export Value',][,-c('element')]
df2$value = df2$value*1000
df = merge(df1,df2, c('year','country','product_type'))
setnames(df, old=c('value.x','value.y'), new=c('equivalent_tonnes','fob_usd'))

df[country=='BRAZIL' & product_type %in% c('Meat of cattle boneless, fresh or chilled'),]$product_type="BEEF BONELESS"
df[country=='BRAZIL' & product_type %in% c("Maize (corn)","Bran of maize","Cake of maize","Green corn (maize)","Flour of maize"),]$product_type="Corn equivalents"
df[country=='BRAZIL' & product_type %in% c("Soya beans","Cake of  soya beans","Soya bean oil")]$product_type="Soy bean equivalents"
df[country=='ARGENTINA' & product_type %in% c("Soya beans")]$product_type="SOYBEANS"
df[country=='ARGENTINA' & product_type %in% c("Cake of  soya beans")]$product_type="SOYBEAN CAKE"
df[country=='ARGENTINA' & product_type %in% c("Soya bean oil")]$product_type="SOYBEAN OIL"

prod_list = c("BEEF BONELESS","Corn equivalents","Soy bean equivalents","SOYBEANS","SOYBEAN CAKE","SOYBEAN OIL")
df = df[product_type %in% prod_list,][, lapply(.SD, sum), by=.(year,country,product_type)]
df$source='faostat'
df_faostat=df


################## FAOSTAT origin-destination data - major importers

df = fread("../../data/trade/clean/tradeflows_faostat.csv.gz")
setnames(df, old=c('destination_id','origin_id','X_njt','Q_njt'), new=c('destination','country','fob_usd','equivalent_tonnes'))
df$destination=toupper(df$destination)
df$country=toupper(df$country)
df$product_type=df$product
df$fob_usd = df$fob_usd*1000
df = df[,c('year','country','destination','product_type','fob_usd','equivalent_tonnes')]

df[country=='BRAZIL' & product_type %in% c("Meat, cattle, boneless (beef & veal)"),]$product_type="BEEF BONELESS"
df[country=='BRAZIL' & product_type %in% c("Maize","Cake, maize","Oil, maize","Maize, green"),]$product_type="Corn equivalents"
df[country=='BRAZIL' & product_type %in% c("Soybeans","Cake, soybeans","Oil, soybean")]$product_type="Soy bean equivalents"
df[country=='ARGENTINA' & product_type %in% c("Soybeans")]$product_type="SOYBEANS"
df[country=='ARGENTINA' & product_type %in% c("Cake, soybeans")]$product_type="SOYBEAN CAKE"
df[country=='ARGENTINA' & product_type %in% c("Oil, soybean")]$product_type="SOYBEAN OIL"

prod_list = c("BEEF BONELESS","Corn equivalents","Soy bean equivalents","SOYBEANS","SOYBEAN CAKE","SOYBEAN OIL")
df =  df[product_type %in% prod_list,][, lapply(.SD, sum), by=.(year,country,destination,product_type)]
df$source = 'faostat'
df_faostat_od = df


################## FAOSTAT origin-destination data - all importers

df = fread("../../data/trade/clean/imports_from_world_iv.csv.gz")
setnames(df, old=c('item','imports_from_argentina','imports_from_brazil'), new=c('product_type','ARGENTINA','BRAZIL'))
df = df[,c('year','destination','product_type','element','ARGENTINA','BRAZIL')]

df = melt(df, id.vars = c('year','destination','product_type','element'), variable.name = "country")
df1=df[element=='IMPORT QUANTITY',][,-c('element')]
df2=df[element=='IMPORT VALUE',][,-c('element')]
df=merge(df1,df2, by= c('year','country','destination','product_type'),)
setnames(df, old=c('value.x','value.y'), new=c('equivalent_tonnes','fob_usd'))

df[country=='BRAZIL' & product_type %in% c("MEAT, CATTLE, BONELESS (BEEF & VEAL)"),]$product_type="BEEF BONELESS"
df[country=='BRAZIL' & product_type %in% c("MAIZE"),]$product_type="Corn equivalents"
df[country=='BRAZIL' & product_type %in% c("SOYBEANS")]$product_type="Soy bean equivalents"
df[country=='ARGENTINA' & product_type %in% c("SOYBEANS")]$product_type="SOYBEANS"

prod_list = c("BEEF BONELESS","Corn equivalents","Soy bean equivalents","SOYBEANS")
df =  df[product_type %in% prod_list,][, lapply(.SD, sum), by=.(year,country,destination,product_type)]
df$source = 'faostat'
df_faostat_od_all = df


################## Validate origin export levels

df1 = df_b_br[, c('year','country','product_type','fob_usd','equivalent_tonnes')][, lapply(.SD, sum), by=.(year,country,product_type)]
df2 = df_s_br[,c('year','country','product_type','fob_usd','equivalent_tonnes')][, lapply(.SD, sum), by=.(year,country,product_type)]
df3 = df_m_br[destination!='BRAZIL',c('year','country','product_type','fob_usd','equivalent_tonnes')][, lapply(.SD, sum), by=.(year,country,product_type)]
df4 = df_s_ar[,c('year','country','product_type','fob_usd','equivalent_tonnes')][, lapply(.SD, sum), by=.(year,country,product_type)]
df = rbind(df1,df2,df3,df4)
df$source='trase'
df_trase = df

var_list = c("year","country","product_type","fob_usd","equivalent_tonnes","source")
df=merge(df_trase,df_faostat, by=c("year","country","product_type"))
df$ratio_value = df$fob_usd.x/ df$fob_usd.y 
df$ratio_quantities = df$equivalent_tonnes.x/df$equivalent_tonnes.y
df_both=df

#Figure
titlelab='Exports (million USD)'
xlab='values from TRASE'
ylab='values from FAOSTAT'
df$y_var=df$fob_usd.y/1000000
df$x_var=df$fob_usd.x/1000000
ols = lm(y_var ~ x_var, data=df)
intercept2 = round(summary(ols)$coefficients[1],3)
slope2 = round(summary(ols)$coefficients[2],3)
rsq2 = round(summary(ols)$r.squared,3)
fig_o = ggplot(data = df, aes(x = x_var, y = y_var)) +
  geom_abline(intercept = 0, slope = 1, linewidth = 1, linetype='dotted', color='gray')+
  geom_point(alpha = 0.5,size=2) +
  geom_smooth(formula = y ~ x, method='lm', se=F, linewidth=1, color='darkred') +
  theme_minimal() +
  labs(title=titlelab, x=xlab, y=ylab, subtitle=TeX(paste0("Slope=", slope2, ", R-sq=", rsq2)))+
  theme(axis.title.x = element_text(hjust = 1, size=8),
        axis.title.y = element_text(hjust=1, size=8),
        plot.title = element_text(size = 8.5),
        plot.subtitle = element_text(size = 8, color='darkred'),
        aspect.ratio = 1,
        panel.grid.minor = element_blank(),
        legend.position = 'none')


################## Validate bilateral export levels

df1 = df_b_br[, c('year','country','destination','product_type','fob_usd','equivalent_tonnes')][, lapply(.SD, sum), by=.(year,country,destination,product_type)]
df2 = df_s_br[, c('year','country','destination','product_type','fob_usd','equivalent_tonnes')][, lapply(.SD, sum), by=.(year,country,destination,product_type)]
df3 = df_m_br[destination!='BRAZIL', c('year','country','destination','product_type','fob_usd','equivalent_tonnes')][, lapply(.SD, sum), by=.(year,country,destination,product_type)]
df4 = df_s_ar[, c('year','country','destination','product_type','fob_usd','equivalent_tonnes')][, lapply(.SD, sum), by=.(year,country,destination,product_type)]
df = rbind(df1,df2,df3,df4)
df$source = 'trase'
df_trase_od = df

list_trase = data.frame(unique(df_trase_od$destination))
list_faostat = data.frame(unique(df_faostat_od$destination))

df=merge(df_trase_od,df_faostat_od, by=c("year","country","destination","product_type"))
df$ratio_value = df$fob_usd.x/ df$fob_usd.y 
df$ratio_quantities = df$equivalent_tonnes.x/df$equivalent_tonnes.y
df_both_od=df

#Figure
titlelab='Bilateral trade flows (million USD)'
xlab='values from TRASE'
ylab='values from FAOSTAT'
df$y_var=df$fob_usd.y/1000000
df$x_var=df$fob_usd.x/1000000
ols = lm(y_var ~ x_var, data=df)
intercept2 = round(summary(ols)$coefficients[1],3)
slope2 = round(summary(ols)$coefficients[2],3)
rsq2 = round(summary(ols)$r.squared,3)
fig_od = ggplot(data = df, aes(x = x_var, y = y_var)) +
  geom_abline(intercept = 0, slope = 1, linewidth = 1, linetype='dotted', color='gray')+
  geom_point(alpha = 0.5, size=2) +
  geom_smooth(formula = y ~ x, method='lm', se=F, linewidth=1, color='darkred') +
  theme_minimal() +
  labs(title=titlelab, x=xlab, y=ylab, subtitle=TeX(paste0("Slope=", slope2, ", R-sq=", rsq2)))+
  theme(axis.title.x = element_text(hjust = 1, size=8),
        axis.title.y = element_text(hjust=1, size=8),
        plot.title = element_text(size = 8.5),
        plot.subtitle = element_text(size = 8, color='darkred'),
        aspect.ratio = 1,
        panel.grid.minor = element_blank(),
        legend.position = 'none')

ggarrange(fig_o, fig_od, ncol=2)
ggsave(paste0("../../output/figures/appendix/figure1_validation.pdf"), width=6.25, height=3.25)
